Kiva - overview tags¶
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
import numpy as np
import pandas as pd
import cudf
import plotly.express as px
from tqdm import tqdm
tqdm.pandas()
Import raw data¶
First, read data in .jsonl file format as a pandas data frame
Then store the dataframe in .parquet format for easy access later
# ds = cudf.read_parquet("../fulldata/kiva_2023-08-10T17-57-12.parquet")
ds = cudf.read_parquet("../fulldata/kiva_2023-08-20T16-16-43.parquet")
ds.isna().all(axis=1).sum()
24
ds.dropna(axis=0, how="all", inplace=True)
Store the names of the interesting columns for easy access
class COL:
    """Column-name constants for the Kiva loans dataframe (flattened JSON paths)."""

    LOAN_AMOUNT = "loanAmount"  # requested loan amount
    FUNDED_AMOUNT = "loanFundraisingInfo.fundedAmount"  # amount actually raised
    RAISED_DATE = "raisedDate"  # when the loan became fully funded
    POSTED_DATE = "fundraisingDate"  # when the loan was published
    TAGS = "tags"  # list of tag strings per loan
    COUNTRY_NAME = "geocode.country.name"
    COUNTRY = "geocode.country.isoCode"
    REGION = "geocode.country.region"
    STATE = "geocode.state"
    LAT = "geocode.latitude"
    LONG = "geocode.longitude"
    SPEED = "collection_speed"  # derived below: fundedAmount / funding-duration days
ds.columns
Index(['anonymizationLevel', 'borrowerCount', 'borrowers', 'dafEligible',
'delinquent', 'description', 'descriptionInOriginalLanguage',
'disbursalDate', 'distributionModel', 'fundraisingDate', 'gender',
'hasCurrencyExchangeLossLenders', 'id', 'isMatchable', 'inPfp',
'loanAmount', 'lenderRepaymentTerm', 'matcherAccountId', 'matcherName',
'matchRatio', 'matchingText', 'name', 'minNoteSize', 'paidAmount',
'pfpMinLenders', 'plannedExpirationDate', 'previousLoanId',
'raisedDate', 'researchScore', 'repaymentInterval', 'status', 'tags',
'use', 'video', 'whySpecial', 'activity.id', 'activity.name',
'endorser.id', 'endorser.image.id', 'endorser.image.url',
'endorser.inviteeCount', 'endorser.lenderPage.city',
'endorser.lenderPage.state', 'endorser.lenderPage.country.name',
'endorser.lenderPage.country.isoCode',
'endorser.lenderPage.country.region', 'endorser.lenderPage.country.ppp',
'endorser.lenderPage.country.numLoansFundraising',
'endorser.lenderPage.country.fundsLentInCountry',
'endorser.lenderPage.loanBecause', 'endorser.lenderPage.occupation',
'endorser.lenderPage.otherInfo', 'endorser.lenderPage.url',
'endorser.lenderPage.whereabouts', 'endorser.loanCount',
'endorser.memberSince', 'endorser.name', 'endorser.publicId',
'geocode.city', 'geocode.state', 'geocode.country.name',
'geocode.country.isoCode', 'geocode.country.region',
'geocode.country.ppp', 'geocode.country.numLoansFundraising',
'geocode.country.fundsLentInCountry', 'geocode.postalCode',
'geocode.latitude', 'geocode.longitude', 'image.id', 'image.url',
'loanFundraisingInfo.fundedAmount',
'loanFundraisingInfo.isExpiringSoon',
'loanFundraisingInfo.reservedAmount', 'originalLanguage.id',
'originalLanguage.isActive', 'originalLanguage.isoCode',
'originalLanguage.name', 'sector.id', 'sector.name', 'terms.currency',
'terms.currencyFullName', 'terms.disbursalAmount',
'terms.disbursalDate', 'terms.expectedPayments', 'terms.loanAmount',
'terms.lenderRepaymentTerm', 'terms.lossLiabilityCurrencyExchange',
'terms.lossLiabilityNonpayment', 'terms.flexibleFundraisingEnabled',
'userProperties.favorited', 'userProperties.lentTo',
'userProperties.subscribed', 'userProperties.promoEligible',
'userProperties.amountInBasket', 'endorser.lenderPage.country',
'endorser', 'video.thumbnailImageId', 'video.youtubeId'],
dtype='object')
ds.index.duplicated().sum()
array(0)
# Restrict the frame to the columns this analysis actually uses.
keep_columns = [
    COL.LOAN_AMOUNT,
    COL.FUNDED_AMOUNT,
    COL.RAISED_DATE,
    COL.POSTED_DATE,
    # "disbursalDate",
    COL.COUNTRY_NAME,
    COL.COUNTRY,
    COL.STATE,
    COL.REGION,
    COL.LAT,
    COL.LONG,
    COL.TAGS,
]
ds = ds[keep_columns]
ds.tail()
| loanAmount | loanFundraisingInfo.fundedAmount | raisedDate | fundraisingDate | geocode.country.name | geocode.country.isoCode | geocode.state | geocode.country.region | geocode.latitude | geocode.longitude | tags | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 2564682 | 600.00 | 600.00 | 2006-07-29T15:12:14Z | 2006-07-27T18:06:53Z | Honduras | HN | El Paraiso | Central America | 14.083333 | -86.500000 | [] |
| 2564683 | 650.00 | 650.00 | 2006-07-29T19:53:13Z | 2006-07-27T17:00:10Z | Honduras | HN | El Paraiso | Central America | 14.033333 | -86.583333 | [] |
| 2564684 | 150.00 | 150.00 | 2006-07-27T13:41:46Z | 2006-07-27T06:02:07Z | Kenya | KE | Kiambu | Africa | -1.166667 | 36.833333 | [] |
| 2564685 | 225.00 | 225.00 | 2006-07-28T12:54:18Z | 2006-07-27T01:24:27Z | Kenya | KE | Kiambu | Africa | -1.166667 | 36.833333 | [] |
| 2564686 | 150.00 | 150.00 | 2006-07-27T04:44:24Z | 2006-07-27T01:23:56Z | Kenya | KE | Kiambu | Africa | -1.166667 | 36.833333 | [] |
# Normalize dtypes: float32 for numeric columns, real datetimes for the
# timestamp strings, and category for the low-cardinality location columns.
for _c in (COL.LOAN_AMOUNT, COL.FUNDED_AMOUNT, COL.LAT, COL.LONG):
    ds[_c] = ds[_c].astype("float32")
for _c in (COL.RAISED_DATE, COL.POSTED_DATE):
    ds[_c] = cudf.to_datetime(ds[_c], format="%Y-%m-%dT%H:%M:%SZ")
for _c in (COL.COUNTRY, COL.COUNTRY_NAME, COL.REGION, COL.STATE):
    ds[_c] = ds[_c].astype("category")
ds.info()
<class 'cudf.core.dataframe.DataFrame'> Int64Index: 2564663 entries, 0 to 2564686 Data columns (total 11 columns): # Column Dtype --- ------ ----- 0 loanAmount float32 1 loanFundraisingInfo.fundedAmount float32 2 raisedDate datetime64[ns] 3 fundraisingDate datetime64[ns] 4 geocode.country.name category 5 geocode.country.isoCode category 6 geocode.state category 7 geocode.country.region category 8 geocode.latitude float32 9 geocode.longitude float32 10 tags list dtypes: category(4), datetime64[ns](2), float32(4), list(1) memory usage: 153.8 MB
ds.index.duplicated().sum()
array(0)
Preprocessing¶
We keep only the successful loans¶
success = ds[COL.LOAN_AMOUNT] == ds[COL.FUNDED_AMOUNT]
counts = success.value_counts()
counts[True] / (counts[True] + counts[False]), len(ds)
(0.9564589967570788, 2564663)
# keep success only
ds = ds[success]
Drop some NaN¶
ds.isna().sum()
loanAmount 0 loanFundraisingInfo.fundedAmount 0 raisedDate 2 fundraisingDate 0 geocode.country.name 0 geocode.country.isoCode 0 geocode.state 300254 geocode.country.region 0 geocode.latitude 244728 geocode.longitude 244728 tags 0 dtype: int64
ds.dropna(subset=[COL.LOAN_AMOUNT, COL.FUNDED_AMOUNT, COL.POSTED_DATE, COL.RAISED_DATE], inplace=True)
Collection Speed¶
# How long each loan took to be fully funded, and the implied amount-per-day speed.
ds["funding_duration"] = ds[COL.RAISED_DATE] - ds[COL.POSTED_DATE]
seconds = ds["funding_duration"].astype("int64") / 1e9  # timedelta64[ns] -> seconds
ds["funding_duration_days"] = seconds / (24 * 60 * 60)  # seconds -> days
ds[COL.SPEED] = ds[COL.FUNDED_AMOUNT] / ds["funding_duration_days"]
ds.head()
| loanAmount | loanFundraisingInfo.fundedAmount | raisedDate | fundraisingDate | geocode.country.name | geocode.country.isoCode | geocode.state | geocode.country.region | geocode.latitude | geocode.longitude | tags | funding_duration | funding_duration_days | collection_speed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 24 | 75.0 | 75.0 | 2023-08-20 15:49:49 | 2023-08-20 15:30:50 | Philippines | PH | Eastern Visayas | Asia | 10.165355 | 124.84034 | [#Parent] | 0 days 00:18:59 | 0.013183 | 5689.201054 |
| 28 | 100.0 | 100.0 | 2023-08-20 16:06:26 | 2023-08-20 15:30:49 | Philippines | PH | Eastern Visayas | Asia | 10.165355 | 124.84034 | [volunteer_pick, volunteer_like] | 0 days 00:35:37 | 0.024734 | 4043.051006 |
| 47 | 100.0 | 100.0 | 2023-08-20 15:49:49 | 2023-08-20 15:10:11 | Philippines | PH | Eastern Visayas | Asia | 10.165355 | 124.84034 | [volunteer_pick, volunteer_like] | 0 days 00:39:38 | 0.027523 | 3633.305299 |
| 61 | 100.0 | 100.0 | 2023-08-20 15:54:58 | 2023-08-20 14:50:11 | Philippines | PH | Eastern Visayas | Asia | 10.165355 | 124.84034 | [#Health and Sanitation] | 0 days 01:04:47 | 0.044988 | 2222.793928 |
| 63 | 100.0 | 100.0 | 2023-08-20 15:34:06 | 2023-08-20 14:50:11 | Philippines | PH | Eastern Visayas | Asia | 10.165355 | 124.84034 | [] | 0 days 00:43:55 | 0.030498 | 3278.937381 |
There are some projects that were fully funded before being published.
Let's show them and then get rid of them.
# some project is already fulfilled before publish
ds[ds[COL.SPEED] < 0]
| loanAmount | loanFundraisingInfo.fundedAmount | raisedDate | fundraisingDate | geocode.country.name | geocode.country.isoCode | geocode.state | geocode.country.region | geocode.latitude | geocode.longitude | tags | funding_duration | funding_duration_days | collection_speed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 866326 | 125.0 | 125.0 | 2015-08-26 15:27:35 | 2015-08-26 18:59:15 | Kenya | KE | Africa | 1.0 | 38.0 | [] | -1 days +20:28:20 | -0.146991 | -850.393701 | |
| 870336 | 125.0 | 125.0 | 2015-08-13 20:58:07 | 2015-08-13 21:17:46 | Kenya | KE | Africa | 1.0 | 38.0 | [] | -1 days +23:40:21 | -0.013646 | -9160.305344 | |
| 875658 | 125.0 | 125.0 | 2015-07-29 14:35:38 | 2015-07-29 17:13:45 | Kenya | KE | Africa | 1.0 | 38.0 | [] | -1 days +21:21:53 | -0.109803 | -1138.399916 | |
| 885849 | 125.0 | 125.0 | 2015-06-23 09:52:47 | 2015-06-23 14:49:13 | Kenya | KE | Africa | 1.0 | 38.0 | [] | -1 days +19:03:34 | -0.205856 | -607.219161 | |
| 894007 | 125.0 | 125.0 | 2015-05-22 03:42:29 | 2015-05-22 04:32:53 | Kenya | KE | Africa | 1.0 | 38.0 | [] | -1 days +23:09:36 | -0.035000 | -3571.428571 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2564082 | 500.0 | 500.0 | 2005-03-31 06:27:55 | 2005-04-15 17:00:00 | Uganda | UG | Eastern Region | Africa | 0.75 | 34.08333206 | [] | -16 days +13:27:55 | -15.438947 | -32.385629 |
| 2564083 | 300.0 | 300.0 | 2005-03-31 06:27:55 | 2005-04-15 17:00:00 | Uganda | UG | Eastern Region | Africa | 0.75 | 34.08333206 | [user_favorite] | -16 days +13:27:55 | -15.438947 | -19.431377 |
| 2564084 | 500.0 | 500.0 | 2005-03-31 06:27:55 | 2005-04-15 17:00:00 | Uganda | UG | Eastern Region | Africa | 0.75 | 34.08333206 | [user_favorite] | -16 days +13:27:55 | -15.438947 | -32.385629 |
| 2564085 | 300.0 | 300.0 | 2005-03-31 06:27:55 | 2005-04-15 17:00:00 | Uganda | UG | Eastern Region | Africa | 0.75 | 34.08333206 | [user_favorite] | -16 days +13:27:55 | -15.438947 | -19.431377 |
| 2564086 | 500.0 | 500.0 | 2005-03-31 06:27:55 | 2005-04-15 17:00:00 | Uganda | UG | Eastern Region | Africa | 0.75 | 34.08333206 | [user_favorite] | -16 days +13:27:55 | -15.438947 | -32.385629 |
346 rows × 14 columns
ds = ds[ds[COL.SPEED] >= 0]
Encode tags using MultiLabelBinarizer¶
tagdf = ds[["tags"]].to_pandas()
tagdf.head()
| tags | |
|---|---|
| 24 | [#Parent] |
| 28 | [volunteer_pick, volunteer_like] |
| 47 | [volunteer_pick, volunteer_like] |
| 61 | [#Health and Sanitation] |
| 63 | [] |
from sklearn.preprocessing import MultiLabelBinarizer
lb = MultiLabelBinarizer()
mlb = lb.fit_transform(tagdf["tags"])
mlb.shape
(2452647, 65)
tags_columns = ["tag_" + i for i in lb.classes_]
tag_ds = cudf.DataFrame(mlb, columns=tags_columns, dtype="int8", index=tagdf.index)
del tagdf
tag_ds.sum()
tag_ 9
tag_#Agriculture 17
tag_#Animals 172147
tag_#BIPOC-owned Business 6686
tag_#Biz Durable Asset 104948
...
tag_reserved_crisis_support_loan 29
tag_user_favorite 695468
tag_user_like 1
tag_volunteer_like 42151
tag_volunteer_pick 75913
Length: 65, dtype: int64
tag_ds.columns
Index(['tag_', 'tag_#Agriculture', 'tag_#Animals', 'tag_#BIPOC-owned Business',
'tag_#Biz Durable Asset', 'tag_#COVID-19', 'tag_#CommunityImpact',
'tag_#Eco-friendly', 'tag_#EcoFriendly', 'tag_#Education',
'tag_#Elderly', 'tag_#Fabrics', 'tag_#Female Education',
'tag_#First Loan', 'tag_#GenderEquity', 'tag_#Health and Sanitation',
'tag_#Hidden Gem', 'tag_#Inspiring Story', 'tag_#Interesting Photo',
'tag_#Job Creator', 'tag_#Kaiser',
'tag_#Latinx/Hispanic-Owned Business', 'tag_#Low-profit FP',
'tag_#Married', 'tag_#NewBusiness', 'tag_#Orphan', 'tag_#Parent',
'tag_#Post-disbursed', 'tag_#Powerful Story', 'tag_#Refugee',
'tag_#Repair Renew Replace', 'tag_#Repeat Borrower', 'tag_#Schooling',
'tag_#Single', 'tag_#Single Parent', 'tag_#StandoutBackstory',
'tag_#Supporting Family', 'tag_#Sustainable Ag',
'tag_#TangibleProducts', 'tag_#Team Guys Holding Fish',
'tag_#Technology', 'tag_#Tourism', 'tag_#Trees',
'tag_#US Black-Owned Business', 'tag_#US Environmental Loan',
'tag_#US immigrant', 'tag_#Umpqua', 'tag_#Unique', 'tag_#Vegan',
'tag_#Widowed', 'tag_#Woman-Owned Business', 'tag_GoDaddy', 'tag_LGBTQ',
'tag_LISCChicago', 'tag_MUFG', 'tag_Salesforce', 'tag_US Refugee',
'tag_Viral', 'tag_beauty', 'tag_cow',
'tag_reserved_crisis_support_loan', 'tag_user_favorite',
'tag_user_like', 'tag_volunteer_like', 'tag_volunteer_pick'],
dtype='object')
# join with the original df
ds = ds.join(tag_ds)
del tag_ds
ds.head()
| loanAmount | loanFundraisingInfo.fundedAmount | raisedDate | fundraisingDate | geocode.country.name | geocode.country.isoCode | geocode.state | geocode.country.region | geocode.latitude | geocode.longitude | ... | tag_Salesforce | tag_US Refugee | tag_Viral | tag_beauty | tag_cow | tag_reserved_crisis_support_loan | tag_user_favorite | tag_user_like | tag_volunteer_like | tag_volunteer_pick | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 26655 | 1725.0 | 1725.0 | 2023-06-13 12:08:22 | 2023-05-23 20:40:05 | Vietnam | VN | <NA> | Asia | <NA> | <NA> | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 26656 | 2150.0 | 2150.0 | 2023-05-27 22:18:41 | 2023-05-23 20:40:05 | Vietnam | VN | Thanh Hoa | Asia | 19.97921181 | 106.0111237 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 26657 | 2975.0 | 2975.0 | 2023-06-15 18:26:53 | 2023-05-23 20:40:03 | Samoa | WS | <NA> | Oceania | <NA> | <NA> | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 26658 | 700.0 | 700.0 | 2023-06-22 14:36:29 | 2023-05-23 20:20:26 | Fiji | FJ | <NA> | Oceania | <NA> | <NA> | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 26659 | 150.0 | 150.0 | 2023-05-23 22:06:57 | 2023-05-23 20:20:25 | Madagascar | MG | Antananarivo Province | Africa | -19.65134239 | 47.32661438 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 79 columns
# Sanity check: for random sample rows, the one-hot tag columns must agree
# with the original `tags` list column.
tag_columns = [a for a in ds.columns if a.startswith("tag_")]
for _ in range(50):
    sam = ds.sample(1)
    tags_list = sam["tags"].iloc[0]
    for atag in tag_columns:
        # Strip only the leading "tag_" prefix — str.replace("tag_", "")
        # would also delete the substring anywhere inside a tag name.
        if atag[len("tag_"):] in tags_list:
            assert sam[atag].iloc[0] == 1
        else:
            assert sam[atag].iloc[0] == 0
# drop the original `tags` column (now encoded as one-hot tag_* columns)
ds.drop(["tags"], axis=1, inplace=True)
Quickly refine tags¶
merge 'tag_#Eco-friendly' and 'tag_#EcoFriendly'¶
# A project counts as eco-friendly when either spelling variant was set.
either_variant = (ds["tag_#Eco-friendly"] + ds["tag_#EcoFriendly"]) > 0
ds["tag_#EcoFriendly"] = either_variant.astype("int8")
ds.drop("tag_#Eco-friendly", axis=1, inplace=True)
Keep only tags visible to users¶
Take a look at this screenshot. We notice that some tags in the dataframe are not displayed on the website.

These undisplayed tags are tag_user_favorite, tag_user_like, tag_volunteer_like, tag_volunteer_pick. It might be because those are used internally by the Kiva platform.
And because those tags are not shown to lenders, we can get rid of them here.
# drop some meaningless tags
ds.drop(
["tag_", "tag_user_favorite", "tag_user_like", "tag_volunteer_like", "tag_volunteer_pick"], axis=1, inplace=True
)
ds.info()
<class 'cudf.core.dataframe.DataFrame'> Int64Index: 2452647 entries, 26655 to 2564545 Data columns (total 72 columns): # Column Dtype --- ------ ----- 0 loanAmount float32 1 loanFundraisingInfo.fundedAmount float32 2 raisedDate datetime64[ns] 3 fundraisingDate datetime64[ns] 4 geocode.country.name category 5 geocode.country.isoCode category 6 geocode.state category 7 geocode.country.region category 8 geocode.latitude float32 9 geocode.longitude float32 10 funding_duration timedelta64[ns] 11 funding_duration_days float64 12 collection_speed float64 13 tag_#Agriculture int8 14 tag_#Animals int8 15 tag_#BIPOC-owned Business int8 16 tag_#Biz Durable Asset int8 17 tag_#COVID-19 int8 18 tag_#CommunityImpact int8 19 tag_#EcoFriendly int8 20 tag_#Education int8 21 tag_#Elderly int8 22 tag_#Fabrics int8 23 tag_#Female Education int8 24 tag_#First Loan int8 25 tag_#GenderEquity int8 26 tag_#Health and Sanitation int8 27 tag_#Hidden Gem int8 28 tag_#Inspiring Story int8 29 tag_#Interesting Photo int8 30 tag_#Job Creator int8 31 tag_#Kaiser int8 32 tag_#Latinx/Hispanic-Owned Business int8 33 tag_#Low-profit FP int8 34 tag_#Married int8 35 tag_#NewBusiness int8 36 tag_#Orphan int8 37 tag_#Parent int8 38 tag_#Post-disbursed int8 39 tag_#Powerful Story int8 40 tag_#Refugee int8 41 tag_#Repair Renew Replace int8 42 tag_#Repeat Borrower int8 43 tag_#Schooling int8 44 tag_#Single int8 45 tag_#Single Parent int8 46 tag_#StandoutBackstory int8 47 tag_#Supporting Family int8 48 tag_#Sustainable Ag int8 49 tag_#TangibleProducts int8 50 tag_#Team Guys Holding Fish int8 51 tag_#Technology int8 52 tag_#Tourism int8 53 tag_#Trees int8 54 tag_#US Black-Owned Business int8 55 tag_#US Environmental Loan int8 56 tag_#US immigrant int8 57 tag_#Umpqua int8 58 tag_#Unique int8 59 tag_#Vegan int8 60 tag_#Widowed int8 61 tag_#Woman-Owned Business int8 62 tag_GoDaddy int8 63 tag_LGBTQ int8 64 tag_LISCChicago int8 65 tag_MUFG int8 66 tag_Salesforce int8 67 tag_US Refugee int8 68 tag_Viral int8 69 
tag_beauty int8 70 tag_cow int8 71 tag_reserved_crisis_support_loan int8 dtypes: category(4), datetime64[ns](2), float32(4), float64(2), int8(59), timedelta64[ns](1) memory usage: 300.3 MB
ds.isna().sum().sort_values()
loanAmount 0
loanFundraisingInfo.fundedAmount 0
raisedDate 0
fundraisingDate 0
geocode.country.name 0
...
tag_cow 0
tag_reserved_crisis_support_loan 0
geocode.latitude 244711
geocode.longitude 244711
geocode.state 300236
Length: 72, dtype: int64
Now drawing¶
Tags vs time¶
# For every project, enumerate the calendar timestamps it was actively
# fundraising (one pd.date_range from posted date to raised date per row).
time_df = ds[[COL.POSTED_DATE, COL.RAISED_DATE]].to_pandas()
# NOTE(review): row-wise apply over ~2.5M rows is slow (~7 min per the tqdm
# output below); presumably done on the pandas side because this per-row
# date_range has no cudf equivalent — confirm before optimizing.
time_df["date"] = time_df.progress_apply(
    lambda row: list(pd.date_range(row[COL.POSTED_DATE], row[COL.RAISED_DATE])), axis=1
)
# Move the result back to the GPU and free the pandas copy.
time_ds = cudf.from_pandas(time_df)
del time_df
time_ds.head()
100%|██████████| 2452647/2452647 [06:42<00:00, 6093.39it/s]
| fundraisingDate | raisedDate | date | |
|---|---|---|---|
| 26655 | 2023-05-23 20:40:05 | 2023-06-13 12:08:22 | [2023-05-23T20:40:05.000000, 2023-05-24T20:40:... |
| 26656 | 2023-05-23 20:40:05 | 2023-05-27 22:18:41 | [2023-05-23T20:40:05.000000, 2023-05-24T20:40:... |
| 26657 | 2023-05-23 20:40:03 | 2023-06-15 18:26:53 | [2023-05-23T20:40:03.000000, 2023-05-24T20:40:... |
| 26658 | 2023-05-23 20:20:26 | 2023-06-22 14:36:29 | [2023-05-23T20:20:26.000000, 2023-05-24T20:20:... |
| 26659 | 2023-05-23 20:20:25 | 2023-05-23 22:06:57 | [2023-05-23T20:20:25.000000] |
time_ds = ds.merge(time_ds, left_index=True, right_index=True)
time_ds.head()
| loanAmount | loanFundraisingInfo.fundedAmount | raisedDate_x | fundraisingDate_x | geocode.country.name | geocode.country.isoCode | geocode.state | geocode.country.region | geocode.latitude | geocode.longitude | ... | tag_MUFG | tag_Salesforce | tag_US Refugee | tag_Viral | tag_beauty | tag_cow | tag_reserved_crisis_support_loan | fundraisingDate_y | raisedDate_y | date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4630 | 250.0 | 250.0 | 2023-08-04 01:36:00 | 2023-08-02 02:20:07 | Sierra Leone | SL | <NA> | Africa | <NA> | <NA> | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2023-08-02 02:20:07 | 2023-08-04 01:36:00 | [2023-08-02T02:20:07.000000, 2023-08-03T02:20:... |
| 4631 | 4150.0 | 4150.0 | 2023-08-12 16:24:47 | 2023-08-02 02:20:04 | Paraguay | PY | Cordillera | South America | -25.38333321 | -57.15000153 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2023-08-02 02:20:04 | 2023-08-12 16:24:47 | [2023-08-02T02:20:04.000000, 2023-08-03T02:20:... |
| 4632 | 1000.0 | 1000.0 | 2023-08-20 01:23:09 | 2023-08-02 02:00:11 | Cambodia | KH | Siem Reap | Asia | 13.36666679 | 103.8499985 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2023-08-02 02:00:11 | 2023-08-20 01:23:09 | [2023-08-02T02:00:11.000000, 2023-08-03T02:00:... |
| 4634 | 200.0 | 200.0 | 2023-08-02 07:09:51 | 2023-08-02 02:00:10 | Indonesia | ID | Banten | Asia | -6.748270512 | 105.688179 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2023-08-02 02:00:10 | 2023-08-02 07:09:51 | [2023-08-02T02:00:10.000000] |
| 4636 | 175.0 | 175.0 | 2023-08-02 10:57:40 | 2023-08-02 02:00:09 | Indonesia | ID | Banten | Asia | -6.178055763 | 106.6299973 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2023-08-02 02:00:09 | 2023-08-02 10:57:40 | [2023-08-02T02:00:09.000000] |
5 rows × 75 columns
time_ds = time_ds.explode("date")
time_ds["date"] = time_ds["date"].dt.floor("D")
time_ds.head()
| loanAmount | loanFundraisingInfo.fundedAmount | raisedDate_x | fundraisingDate_x | geocode.country.name | geocode.country.isoCode | geocode.state | geocode.country.region | geocode.latitude | geocode.longitude | ... | tag_MUFG | tag_Salesforce | tag_US Refugee | tag_Viral | tag_beauty | tag_cow | tag_reserved_crisis_support_loan | fundraisingDate_y | raisedDate_y | date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4630 | 250.0 | 250.0 | 2023-08-04 01:36:00 | 2023-08-02 02:20:07 | Sierra Leone | SL | <NA> | Africa | <NA> | <NA> | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2023-08-02 02:20:07 | 2023-08-04 01:36:00 | 2023-08-02 |
| 4630 | 250.0 | 250.0 | 2023-08-04 01:36:00 | 2023-08-02 02:20:07 | Sierra Leone | SL | <NA> | Africa | <NA> | <NA> | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2023-08-02 02:20:07 | 2023-08-04 01:36:00 | 2023-08-03 |
| 4631 | 4150.0 | 4150.0 | 2023-08-12 16:24:47 | 2023-08-02 02:20:04 | Paraguay | PY | Cordillera | South America | -25.38333321 | -57.15000153 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2023-08-02 02:20:04 | 2023-08-12 16:24:47 | 2023-08-02 |
| 4631 | 4150.0 | 4150.0 | 2023-08-12 16:24:47 | 2023-08-02 02:20:04 | Paraguay | PY | Cordillera | South America | -25.38333321 | -57.15000153 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2023-08-02 02:20:04 | 2023-08-12 16:24:47 | 2023-08-03 |
| 4631 | 4150.0 | 4150.0 | 2023-08-12 16:24:47 | 2023-08-02 02:20:04 | Paraguay | PY | Cordillera | South America | -25.38333321 | -57.15000153 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2023-08-02 02:20:04 | 2023-08-12 16:24:47 | 2023-08-04 |
5 rows × 75 columns
# Per tag: how many exploded project-days fall on each calendar date.
tag_columns = [c for c in ds.columns if c.startswith("tag_")]
tag_counts = [
    time_ds[time_ds[c] == 1].date.value_counts().rename(c)
    for c in tqdm(tag_columns)
]
tag_counts_concat = cudf.concat(tag_counts, axis=1)
tag_counts_concat
100%|██████████| 59/59 [00:01<00:00, 35.69it/s]
| tag_#Agriculture | tag_#Animals | tag_#BIPOC-owned Business | tag_#Biz Durable Asset | tag_#COVID-19 | tag_#CommunityImpact | tag_#EcoFriendly | tag_#Education | tag_#Elderly | tag_#Fabrics | ... | tag_GoDaddy | tag_LGBTQ | tag_LISCChicago | tag_MUFG | tag_Salesforce | tag_US Refugee | tag_Viral | tag_beauty | tag_cow | tag_reserved_crisis_support_loan | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2008-01-29 | <NA> | 1 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> |
| 2009-07-03 | <NA> | 5 | <NA> | 1 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> |
| 2009-07-04 | <NA> | 5 | <NA> | 1 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> |
| 2009-07-05 | <NA> | 5 | <NA> | 1 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> |
| 2009-07-06 | <NA> | 4 | <NA> | 1 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2023-08-18 | <NA> | 170 | 9 | 70 | <NA> | 1 | 139 | <NA> | 181 | 47 | ... | <NA> | 2 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> |
| 2023-08-19 | <NA> | 76 | 5 | 36 | <NA> | <NA> | 52 | <NA> | 80 | 23 | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> |
| 2023-08-20 | <NA> | 32 | 4 | 13 | <NA> | <NA> | 27 | <NA> | 40 | 12 | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> |
| 2023-08-21 | <NA> | 6 | 3 | 5 | <NA> | <NA> | 8 | <NA> | 4 | 2 | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> |
| 2023-08-22 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | 1 | 1 | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> |
4377 rows × 59 columns
date_range = list(pd.date_range(ds[COL.POSTED_DATE].min(), ds[COL.RAISED_DATE].max(), normalize=True))
tag_hist = cudf.DataFrame({"date": date_range})
tag_hist = tag_hist.set_index("date")
tag_hist = tag_hist.merge(tag_counts_concat, left_index=True, right_index=True, how="outer")
tag_hist_display = tag_hist.reset_index().melt(id_vars=["date"], var_name="tag")
tag_hist_display.dropna(inplace=True)
tag_hist_display = tag_hist_display.to_pandas()
tag_hist_display
| date | tag | value | |
|---|---|---|---|
| 1761 | 2022-10-06 | tag_#Agriculture | 1 |
| 1814 | 2022-08-08 | tag_#Agriculture | 1 |
| 1815 | 2022-08-09 | tag_#Agriculture | 1 |
| 1816 | 2022-08-10 | tag_#Agriculture | 1 |
| 2174 | 2023-01-23 | tag_#Agriculture | 1 |
| ... | ... | ... | ... |
| 373734 | 2020-06-07 | tag_reserved_crisis_support_loan | 13 |
| 373735 | 2020-06-08 | tag_reserved_crisis_support_loan | 13 |
| 373736 | 2020-06-09 | tag_reserved_crisis_support_loan | 13 |
| 373737 | 2020-06-01 | tag_reserved_crisis_support_loan | 12 |
| 373738 | 2020-06-05 | tag_reserved_crisis_support_loan | 13 |
122027 rows × 3 columns
fig = px.histogram(
tag_hist_display, x="date", y="value", color="tag", barmode="overlay", opacity=0.3, histnorm="percent", height=800
)
fig.show()
From the above figure, we can see that some tags only occurred during a short timeframe, e.g.
#Married, #Post-disbursed, Salesforce, beauty, Viral, MUFG, reserved_crisis_support_loan
Number of project vs tag¶
tag_columns = [a for a in ds.columns if a.startswith("tag_")]
class My:
FIG_H = 800
FIG_W = None
project_count_per_tag = ds[tag_columns].sum().sort_values(ascending=False)
project_count_per_tag = project_count_per_tag.to_pandas()
project_count_per_tag.rename("number of project", inplace=True)
fig = px.bar(
project_count_per_tag.head(20),
orientation="v",
text_auto=True,
title="Number of Projects per Tag",
height=My.FIG_H,
width=My.FIG_W,
labels={"x": "Categories", "y": "Number of Loans"},
)
fig.update_traces(showlegend=False)
fig.update_layout(xaxis_title=None, yaxis_title="Number of Projects")
fig.show()
Number of project vs country¶
proj_per_country = ds.groupby(by=["geocode.country.name"]).count()["loanAmount"].sort_values(ascending=False)
proj_per_country = proj_per_country.to_pandas()
proj_per_country.rename("number of project", inplace=True)
fig = px.bar(
proj_per_country.head(20),
orientation="v",
text_auto=True,
title="Number of Projects per Country",
height=My.FIG_H,
width=My.FIG_W,
labels={"x": "Categories", "y": "Values"},
)
fig.update_traces(showlegend=False)
fig.update_layout(xaxis_title="Country", yaxis_title="Number of Projects")
fig.show()
Collection Speed vs Tag¶
def get_tag_performance(_df: pd.DataFrame, num_tag: int = 10, columns=None, speed_col=None) -> pd.DataFrame:
    """Get collection-speed statistics per tag, keeping only the `num_tag` fastest.

    For each tag column, mean/std/count of the speed column are computed over
    the rows where that tag is set. Tags with no observations (NaN mean) are
    dropped; a NaN std (single observation) is reported as 0.

    Args:
        _df: frame with 0/1 tag indicator columns and a speed column.
        num_tag: number of top tags (by mean speed) to keep.
        columns: tag columns to evaluate; defaults to the module-level
            ``tag_columns`` list (backward compatible).
        speed_col: name of the speed column; defaults to ``COL.SPEED``.

    Returns:
        Frame indexed by tag with columns speed_mean, speed_std, count,
        sorted by speed_mean descending, truncated to `num_tag` rows.
    """
    if columns is None:
        columns = tag_columns
    if speed_col is None:
        speed_col = COL.SPEED
    tags_performances = []
    for atag in columns:
        # Filter once per tag; the original scanned the frame three times
        # (once each for mean, std, and count).
        speeds = _df[_df[atag] == 1][speed_col]
        tags_performances.append(
            {"tag": atag, "speed_mean": speeds.mean(), "speed_std": speeds.std(), "count": speeds.count()}
        )
    tags_performances = pd.DataFrame(tags_performances)
    tags_performances.dropna(subset=["speed_mean"], inplace=True)
    tags_performances.fillna(0, inplace=True)
    tags_performances.sort_values("speed_mean", inplace=True, ascending=False)
    tags_performances.set_index("tag", inplace=True)
    return tags_performances.head(num_tag)
# fig = px.bar(tags_performances, y="tag", x="mean", error_x="std", text_auto=True)
fig = px.bar(
get_tag_performance(ds, 20),
y="speed_mean",
text_auto=True,
title="Collection Speed Mean per Tag",
width=My.FIG_W,
height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()
fig = px.bar(
get_tag_performance(ds, 20),
y="speed_mean",
text_auto=True,
title="Collection Speed Mean per Tag",
width=My.FIG_W,
height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()
Collection speed vs Tags for Countries¶
# Build an ISO-code -> country-name lookup table from the data itself.
code_to_name = (
    ds[[COL.COUNTRY_NAME, COL.COUNTRY]]
    .drop_duplicates()
    .set_index(COL.COUNTRY)
    .to_dict()[COL.COUNTRY_NAME]
)
assert code_to_name["VN"] == "Vietnam"
country_code = "VN"
vn_df = ds[ds[COL.COUNTRY] == country_code]
fig = px.bar(
get_tag_performance(vn_df),
y="speed_mean",
text_auto=True,
title=f"Mean Collection Speed for {code_to_name[country_code]}",
width=My.FIG_W,
height=My.FIG_H,
)
fig.update_traces(textfont_size=26, textangle=0, textposition="outside", cliponaxis=False)
fig.show()
country_code = "KE"
vn_df = ds[ds[COL.COUNTRY] == country_code]
fig = px.bar(
get_tag_performance(vn_df),
y="speed_mean",
text_auto=True,
title=f"Mean Collection Speed for {code_to_name[country_code]}",
width=My.FIG_W,
height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()
country_code = "KH"
vn_df = ds[ds[COL.COUNTRY] == country_code]
fig = px.bar(
get_tag_performance(vn_df),
y="speed_mean",
text_auto=True,
title=f"Mean Collection Speed for {code_to_name[country_code]}",
width=My.FIG_W,
height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()
country_code = "PK"
vn_df = ds[ds[COL.COUNTRY] == country_code]
fig = px.bar(
get_tag_performance(vn_df),
y="speed_mean",
text_auto=True,
title=f"Mean Collection Speed for {code_to_name[country_code]}",
width=My.FIG_W,
height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()
country_code = "SV"
vn_df = ds[ds[COL.COUNTRY] == country_code]
fig = px.bar(
get_tag_performance(vn_df),
y="speed_mean",
text_auto=True,
title=f"Mean Collection Speed for {code_to_name[country_code]}",
width=My.FIG_W,
height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()
Correlation between tags and collection speed
# Kendall correlation between each tag indicator and the collection speed.
tag_df = ds[tag_columns].to_pandas()
corr = tag_df.corrwith(ds[COL.SPEED].to_pandas(), method="kendall", drop=True)
corr.rename("correlation", inplace=True)
fig = px.bar(
    corr.sort_values(ascending=False),
    orientation="h",
    text_auto=True,
    # fixed typo in the user-facing title ("betwene" -> "between")
    title="Correlation between Tags and Collection Speed",
    height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.update_layout(xaxis_title="kendall correlation score", yaxis_title=None)
fig.update_traces(showlegend=False)
fig.show()
Influence of the number of tags¶
Above-average speed vs number of tags
tag_count = ds[tag_columns].sum(axis=1)
tag_count_df = tag_count.value_counts()
tag_count_mean = tag_count.mean()
tag_count_std = tag_count.std()
fig = px.bar(
tag_count_df.to_pandas(),
text_auto=True,
title="Distribution of Number of Tags per Project",
width=My.FIG_W,
height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.update_xaxes(tickmode="linear")
fig.update_layout(xaxis_title="Number of Tags", yaxis_title="Number of Project")
fig.update_traces(showlegend=False)
fig.add_vline(x=tag_count_mean)
fig.show()
# Number of above-average speed collection vs number of tags
speed_mean = ds[COL.SPEED].mean()
is_above_average = ds[COL.SPEED] >= speed_mean
tag_count_df = tag_count.to_frame(name="tag_count").join(is_above_average.rename("is_above_average"))
mn = tag_count_df.groupby("tag_count").agg(["sum", "count"])
mn["percentage"] = mn["is_above_average"]["sum"] / mn["is_above_average"]["count"] * 100
fig = px.bar(
mn["percentage"],
text_auto=True,
title="Projects with above-average collection speed based on the number of tags",
labels={"value": "Percentage of Project with Collection Speed above Global Average"},
height=My.FIG_H,
)
# fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.update_traces(showlegend=False)
fig.update_xaxes(tickmode="linear")
fig.show()
Effectiveness of Tags on Top 5% Collection Speed
Pair of tags Performance¶
from itertools import combinations
# Build one 0/1 indicator column per unordered pair of tags (1 iff both set).
tag2_columns = []
for left, right in combinations(tag_columns, 2):
    pair_name = f"{left}__{right}"
    tag2_columns.append(pair_name)
    ds[pair_name] = ds[left] & ds[right]
assert ds[tag2_columns].max().max() == 1
assert ds[tag2_columns].min().min() == 0
speed_mean = ds[COL.SPEED].mean()
# Restrict to projects that collected faster than the global average.
above_average = ds[ds[COL.SPEED] >= speed_mean]
tag2_performances = []
for apair in tqdm(tag2_columns):
    # NOTE(review): multiplying by the 0/1 indicator keeps zero entries for
    # rows WITHOUT the pair, so mean/std are taken over ALL above-average
    # rows, not only the tagged ones — confirm this dilution is intended.
    temp = above_average[apair] * above_average[COL.SPEED]
    mean = temp.mean()
    std = temp.std()
    tag2_performances.append({"tag": apair, "mean": mean, "std": std})
100%|██████████| 1711/1711 [00:01<00:00, 922.80it/s]
la = pd.DataFrame(tag2_performances)
la = la.dropna(subset=["mean"]).sort_values("mean", ascending=True)
la = la.tail(20)
la
| tag | mean | std | |
|---|---|---|---|
| 451 | tag_#Elderly__tag_#Parent | 61.412328 | 5738.995627 |
| 1123 | tag_#Parent__tag_#Single Parent | 61.426469 | 2442.729082 |
| 1375 | tag_#Single Parent__tag_#Woman-Owned Business | 65.372891 | 2354.055291 |
| 1645 | tag_#Widowed__tag_#Woman-Owned Business | 72.374097 | 2441.126271 |
| 1322 | tag_#Schooling__tag_#Woman-Owned Business | 73.109297 | 4022.540638 |
| 1634 | tag_#Vegan__tag_#Woman-Owned Business | 74.000523 | 5939.475894 |
| 1276 | tag_#Repeat Borrower__tag_#Schooling | 80.544568 | 3847.419643 |
| 80 | tag_#Animals__tag_#Parent | 101.176272 | 2248.550738 |
| 364 | tag_#EcoFriendly__tag_#Technology | 103.516060 | 1141.510431 |
| 524 | tag_#Fabrics__tag_#Woman-Owned Business | 109.132849 | 3779.977944 |
| 339 | tag_#EcoFriendly__tag_#Health and Sanitation | 109.789913 | 1069.773658 |
| 64 | tag_#Animals__tag_#Elderly | 110.556436 | 3086.292794 |
| 1121 | tag_#Parent__tag_#Schooling | 114.049577 | 4360.514412 |
| 104 | tag_#Animals__tag_#Woman-Owned Business | 234.278497 | 4059.329960 |
| 85 | tag_#Animals__tag_#Repeat Borrower | 234.753517 | 4337.145020 |
| 1120 | tag_#Parent__tag_#Repeat Borrower | 353.527412 | 8256.977832 |
| 456 | tag_#Elderly__tag_#Repeat Borrower | 401.836528 | 9297.254081 |
| 475 | tag_#Elderly__tag_#Woman-Owned Business | 473.375115 | 8582.458403 |
| 1139 | tag_#Parent__tag_#Woman-Owned Business | 738.201803 | 10952.096874 |
| 1294 | tag_#Repeat Borrower__tag_#Woman-Owned Business | 888.668345 | 11548.088531 |
fig = px.bar(la, y="tag", x="mean", title="Effectiveness of Tags on Collection Speed", width=My.FIG_W, height=My.FIG_H)
# fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
# fig.update_xaxes(tickmode='linear')
# fig.add_vline(x=tag_count_mean)
fig.show()
Save the results¶
# convert the notebook to html so it can be read easily
!jupyter nbconvert --to html 1_data_overview.ipynb
[NbConvertApp] Converting notebook 1_data_overview.ipynb to html [NbConvertApp] Writing 390204 bytes to 1_data_overview.html